package edu.ufrj.pesc.pdfsearch;

import java.io.File;
import java.io.IOException;
import java.util.Vector;

import org.apache.tika.Tika;

/**
 * Recursively collects the PDF files found under a file or directory,
 * using Apache Tika to determine each file's media type.
 */
public class PDFSearchCrawler {

	/** Media type that Tika must report for a file to be collected. */
	private static final String PDF_MEDIA_TYPE = "application/pdf";

	/**
	 * Detector used to determine each file's media type. Held per instance so
	 * that constructing another crawler does not replace the detector of
	 * crawlers that already exist.
	 */
	private final Tika tika;

	/** Creates a crawler with its own Tika instance. */
	public PDFSearchCrawler() {
		tika = new Tika();
	}

	/**
	 * Crawls a file or directory tree for PDF files.
	 * <p>
	 * Directories are descended into recursively; every non-directory entry is
	 * passed to Tika, and entries detected as {@code application/pdf} are
	 * appended to {@code pdfs}. Directories whose contents cannot be listed and
	 * files whose type detection fails with an {@link IOException} are reported
	 * on standard error and skipped, so the rest of the crawl continues.
	 *
	 * @param f file or directory to be processed; must not be {@code null}
	 * @param pdfs accumulator that found PDF files are added to; must not be
	 *             {@code null}
	 * @return the same {@code pdfs} instance that was passed in
	 */
	public Vector<File> crawl(File f, Vector<File> pdfs) {
		if (f.isDirectory()) {
			// recursive case: directory
			File[] subFiles = f.listFiles();
			if (subFiles == null) {
				// listFiles() returns null (not an empty array) when the
				// directory cannot be read; skip it instead of failing with
				// a NullPointerException.
				System.err.println("Could not list directory " + f.getPath() + "; skipping.");
				return pdfs;
			}
			for (File subFile : subFiles) {
				crawl(subFile, pdfs);
			}
		} else {
			try {
				// Constant-first equals stays safe even if no type is reported.
				if (PDF_MEDIA_TYPE.equals(tika.detect(f))) {
					pdfs.add(f);
					System.out.println(f.getName()+" found. Added to list.");
				}
			} catch (IOException e) {
				// Best effort: a file that cannot be inspected is skipped.
				e.printStackTrace();
			}
		}
		return pdfs;
	}

}
